home *** CD-ROM | disk | FTP | other *** search
/ Chip: 2005 Utilities / CHIP Utilities 2005.7z / CHIP Utilities 2005.iso / docs / get-ubcd-docs.pl < prev    next >
Perl Script  |  2004-11-29  |  12KB  |  377 lines

  1. #!/usr/bin/perl 
  2. # -w
  3.  
  4. # to do
  5. #
  6. # update freesco
  7. #
  8. # make html print more fields
  9. # tighten code
  10. # perlize for greater cross platform compatability
  11. # add routine to get size of images
  12. # improve descriptions
  13. # fix link building problem that effects 3 links
  14.  
  15. # input file
  16. open(UBCDDOCS,"<ubcd-docs.csv");
  17.  
  18. # files to write
  19. open(UBCDINDEX,">ubcd-index.html");
  20. open(UBCDINDEXTABLE,">ubcd-index-table.html");
  21. open(UBCDINDEXTXT,">ubcd-index.csv");
  22. open(UBCDINDEXXML,">ubcd.xml");
  23. open(UBCDINDEXXST,">ubcd.xsl");
  24. open(UBCDINDEXDTD,">ubcd.dtd");
  25.  
  26. chdir "/docs_uncompressed";
  27.  
  28. # sort input file by utility name
  29. $srt=`sort ubcd-docs.csv > ubcd-docs.csv.sorted`;
  30. $srt=`cp -f ubcd-docs.csv.sorted ubcd-docs.csv`;
  31. $srt=`rm -f ubcd-docs.csv.sorted`;
  32.  
  33. @data_vars=(utility,doc1,doc1title,doc2,doc2title,doc3,doc3title,doc4,doc4title,doc5,doc5title,doc6,doc6title,doc7,doc7title,webpage,imagename,description,dosapp,category,menu,maintainer,lastupdate,version,size);
  34. @data_vars_enc=(url_orig,utility_enc,doc1_enc,doc1title_enc,doc2_enc,doc2title_enc,doc3_enc,doc3title_enc,doc4_enc,doc4title_enc,doc5_enc,doc5title_enc,doc6_enc,doc6title_enc,doc7_enc,doc7title_enc,webpage_enc,imagename_enc,description_enc,dosapp_enc,category_enc,menu_enc,maintainer_enc,lastupdate_enc,version_enc,size_enc);
  35.  
  36. %titles=(utility => "Utility",doc1 => "Doc #1",doc1title => "Doc #1",doc2 => "Doc #1",doc2title => "Doc #1",doc3 => "Doc #1",doc3title => "Doc #1",doc4 => "Doc #1",doc4title => "Doc #1",doc5 => "Doc #1",doc5title => "Doc #1",doc6 => "Doc #1",doc6title => "Doc #1",doc7 => "Doc #1",doc7title => "Doc #1",webpage => "Doc #1",imagename => "Doc #1",dosapp => "Doc #1",category => "Doc #1",menu => "Doc #1",maintainer => "Doc #1",lastupdate => "Doc #1",version => "Doc #1");
  37. @doc_vars=(doc1,doc2,doc3,doc4,doc5,doc6,doc7,webpage);
  38. $webpagetitle="Web Page";
  39.  
  40. &printdtd;
  41. &printxsl;
  42. &printtitles;
  43.  
  44. print "Gathering Docs\n";
  45. foreach $utility_info (<UBCDDOCS>){
  46.     chomp($utility_info);
  47.     ($utility,$doc1,$doc1title,$doc2,$doc2title,$doc3,$doc3title,$doc4,$doc4title,$doc5,$doc5title,$doc6,$doc6title,$doc7,$doc7title,$webpage,$imagename,$description,$dosapp,$category,$menu,$maintainer,$lastupdate,$version,$size)=split /,/,$utility_info;
  48.     $utility =~ s/"//g;$description =~ s/"//g;$imagename =~ s/"//g;$dosapp =~ s/"//g;$category =~ s/"//g;$menu =~ s/"//g;$maintainer =~ s/"//g;$lastupdate =~ s/"//g;$size =~ s/"//g;$version =~ s/"//g;
  49.     chomp($imagename);
  50.     $imagename_fixed=lc $imagename;
  51.     $file_test="$imagename_fixed".".igz";
  52.     print "looking for [$file_test]\n";
  53.     if (-f "/mnt/disk/images/$file_test"){
  54.         $imagename_fixed="$imagename_fixed".".igz";
  55.     }else{
  56.         $imagename_fixed="$imagename_fixed".".img";    
  57.     }
  58.     
  59.     chomp($size=`du -k /mnt/disk/images/$imagename_fixed|cut -f1`);
  60.     if ($imagename !~ m/[a-z|A-Z]/ or $imagename =~ m/Utility/){next;};
  61.  
  62.     print "\n\n\nGetting docs for [$imagename|$dosapp]\n";
  63.     mkdir $imagename;
  64.     chdir "$imagename";
  65.     print UBCDINDEX qq!$utility   $description   !;
  66.     print UBCDINDEXTXT qq!$utility, $description, !;
  67.     print UBCDINDEXTABLE qq!<tr><td>$utility</td><td>$description</td>!;
  68.  
  69.     &printutilinfo;
  70.  
  71.     foreach $doc_var (@doc_vars){    
  72.         if ($$doc_var =~ m/[a-z|A-Z]/){
  73.             # remove quotes
  74.             $$doc_var =~ s/"//g;
  75.             $doc_url = $$doc_var;
  76.             
  77.             # define other doc vars
  78.             $doc_title_var = "$doc_var" . "title";
  79.  
  80.             # remove quotes
  81.             $$doc_title_var =~ s/"//g;
  82.  
  83.             print "Fetching [$$doc_var]\n";
  84.             system(qq!wget --continue --tries=1 --html-extension --convert-links --page-requisites --user-agent="Mozilla/4.0 (compatable; MSIE 6.0; Windows NT 5.1)" "$$doc_var"!);
  85.             if ($$doc_var =~ m/.pdf$|.PDF$/){
  86.                 $url_orig=$$doc_var;
  87.                 # convert PDFs to html
  88.                 $$doc_var =~ s/http:\//$imagename/g;
  89.                 print "PDF CONVERSION";
  90.                 $cvtoutput=`pdftotext -layout -htmlmeta -eol unix -nopgbrk "/docs_uncompressed/$$doc_var" 2>&1`;
  91.                 print " [$cvtoutput:/docs_uncompressed/$$doc_var]\n";
  92.                 unlink "/docs_uncompressed/$$doc_var";
  93.  
  94.                 # change converted PDF extension to html
  95.                 $$doc_var =~ s/.pdf$|.PDF$/.html/;
  96.     
  97.                 # fix document name & location to be loadable via a browser.
  98.                 $tmp_doc_var = $$doc_var;
  99.                 $$doc_var =~ s/\?|=|&|\+//g;
  100.                 if ($$doc_var ne $tmp_doc_var){
  101.                     chdir "/docs_uncompressed";
  102.                     rename ("/docs_uncompressed/$tmp_doc_var","/docs_uncompressed/$$doc_var");
  103.                     print "File moved [/docs_uncompressed/$tmp_doc_var|/docs_uncompressed/$$doc_var][$!]\n";
  104.                     chdir "/docs_uncompressed";
  105.                 }
  106.                 
  107.                 # write indexes
  108.                 &encode_xml_data;
  109.                 print UBCDINDEX qq!<A HREF="$$doc_var">$$doc_title_var [html]</A>   !;
  110.                 print UBCDINDEXTXT qq!$$doc_title_var, $$doc_var, !;
  111.                 print UBCDINDEXTABLE qq!<td><a href="$$doc_var">$$doc_title_var</a></td>!;
  112.                 &printdoc;
  113.             }else{
  114.                 $url_orig=$$doc_var;
  115.                 # change http: to file:
  116.                 $$doc_var =~ s/http:\//$imagename/g;
  117.  
  118.                 # if the document is a html type document as defined by wget ensure it ends in .html
  119.                 if ($$doc_var !~ m/.aspx$|.htm$|.html$|.faq$|.FAQ$|.lsm$|.txt$|.doc$/ and $$doc_var !~ m/\/$/){$$doc_var = $$doc_var . ".html"};
  120.  
  121.                 # fix document name & location to be loadable via a browser.
  122.                 $tmp_doc_var = $$doc_var;
  123.                 $$doc_var =~ s/\?|=|&|\+//g;
  124.                 if ($$doc_var ne $tmp_doc_var){
  125.                     chdir "/docs_uncompressed";
  126.                     rename ("/docs_uncompressed/$tmp_doc_var","/docs_uncompressed/$$doc_var");
  127.                     print "File moved [/docs_uncompressed/$tmp_doc_var|/docs_uncompressed/$$doc_var][$!]\n";
  128.                     chdir "/docs_uncompressed";
  129.                 }
  130.  
  131.                 # write indexes
  132.                 &encode_xml_data;
  133.                 print UBCDINDEX qq!<A HREF="$$doc_var">$$doc_title_var</A>   !;
  134.                 print UBCDINDEXTXT qq!$$doc_title_var, $$doc_var, !;
  135.                 print UBCDINDEXTABLE qq!<td><a href="$$doc_var">$$doc_title_var</a></td>!;
  136.                 &printdoc;
  137.             }
  138.         }
  139.     }
  140.     print UBCDINDEX qq!<BR>\n!;
  141.     print UBCDINDEXTXT qq!\n!;
  142.     print UBCDINDEXTABLE qq!</tr>\n!;
  143.     print UBCDINDEXXML qq!</utility_info>\n\n!;
  144.     chdir "/docs_uncompressed";
  145. }
  146.  
  147. print UBCDINDEX qq!</body></html>\n!;
  148. print UBCDINDEXTXT qq!End\n!;
  149. print UBCDINDEXTABLE qq!</table></body></html>\n!;
  150. print UBCDINDEXXML qq!</catalog>\n!;
  151.  
  152. chomp($check=`du -bs /docs_uncompressed`);
  153. print "\n\nDocs RAW Size [$check]\n\n";
  154.  
  155. print "Running html tidy on html files\n";
  156. system('find . -name *.html -o -name *.htm -print -exec tidy -modify -upper -quiet -omit -errors {} \; > /dev/null 2>&1');
  157. chomp($check=`du -bs /docs_uncompressed`);
  158. print "Docs after HTML Tidy Size [$check]\n\n";
  159.  
  160. print "Compressing docs_uncompressed to /cmp\n";
  161. $rm_old=`rm -rf /cmp /docs`;
  162. $pack_result=`webpack -b /cmp/`;
  163. chomp($check=`du -bs /cmp`);
  164. print "/cmp compressed Size [$check]\n\n";
  165.  
  166. print "Moving /cmp to /docs\n";
  167. $move=`mv -f /cmp /docs`;
  168.  
  169. print "Archiving docs\n";
  170. $tgz=`tar -czf /docs.tar.gz /docs`;
  171. chomp($tgz_size=`du -bs /docs.tar.gz`);
  172. print "Archive size [$tgz_size]\n\n";
  173.  
  174. print "Done\n\n";
  175. close;
  176. exit;
  177.  
  178. sub encode_xml_data{
  179. $utility_enc=$utility;
  180. $doc1_enc=$doc1;
  181. $doc1title_enc=$doc1title;
  182. $doc2_enc=$doc2;
  183. $doc2title_enc=$doc2title;
  184. $doc3_enc=$doc3;
  185. $doc3title_enc=$doc3title;
  186. $doc4_enc=$doc4;
  187. $doc4title_enc=$doc4title;
  188. $doc5_enc=$doc5;
  189. $doc5title_enc=$doc5title;
  190. $doc6_enc=$doc6;
  191. $doc6title_enc=$doc6title;
  192. $doc7_enc=$doc7;
  193. $doc7title_enc=$doc7title;
  194. $webpage_enc=$webpage;
  195. $imagename_enc=$imagename;
  196. $description_enc=$description;
  197. $dosapp_enc=$dosapp;
  198. $category_enc=$category;
  199. $menu_enc=$menu;
  200. $maintainer_enc=$maintainer;
  201. $lastupdate_enc=$lastupdate;
  202. $version_enc=$version;
  203. $size_enc=$size;
  204.  
  205. foreach $data (@data_vars_enc){ 
  206.     $$data =~ s/</</g;
  207.     $$data =~ s/&/&/g;
  208.     $$data =~ s/>/>/g;
  209.     $$data =~ s/"/"/g;
  210.     $$data =~ s/'/'/g;
  211. }
  212. }
  213.  
  214. sub printtitles{
  215. # print document titles
  216. print UBCDINDEX <<EOF;
  217. <html><head><title>UBCD CD Based Docs - HTML</title></head><body>
  218. <BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a>   <a href="ubcd-index-table.html">HTML Table</a>   <a href="ubcd.xml">XML</a>   <a href="ubcd-index.csv">CSV</a><BR><BR>
  219. <html><head><title>UBCD CD Based Docs - HTML</title></head><body>
  220. UTILITY   DESCRIPTION   DOCUMENTS<BR>
  221. EOF
  222.  
  223. print UBCDINDEXTABLE <<EOF;
  224. <html><head><title>UBCD CD Based Docs - HTML Table</title></head><body>
  225. <BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a>    <a href="ubcd-index-table.html">HTML Table</a>    <a href="ubcd.xml">XML</a>    <a href="ubcd-index.csv">CSV</a><BR><BR>
  226. <table border="1">
  227. <TR><TD>UTILITY</TD><TD>DESCRIPTION</TD><TD>DOCUMENTS</TD></TR>
  228. EOF
  229.  
  230. print UBCDINDEXTXT qq!UBCD CD Based Docs - Text Listing - CSV\n!;
  231. print UBCDINDEXTXT qq!UTILITY,DESCRIPTION,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,WEBPAGE,\n!;
  232. print UBCDINDEXXML <<EOF;
  233. <?xml version="1.0"?>
  234. <?xml-stylesheet type="text/xsl" href="ubcd.xsl"?>
  235. <!DOCTYPE catalog SYSTEM "ubcd.dtd">
  236. <catalog>
  237. EOF
  238. }
  239.  
  240. sub printutilinfo{
  241. &encode_xml_data;
  242. print UBCDINDEXXML <<EOF;
  243. <utility_info>
  244.     <utility>$utility_enc</utility>
  245.     <description>$description_enc</description>
  246.     <imagename>$imagename_enc</imagename>
  247.     <dosapp>$dosapp_enc</dosapp>
  248.     <category>$category_enc</category>
  249.     <menu>$menu_enc</menu>
  250.     <maintainer>$maintainer_enc</maintainer>
  251.     <lastupdate>$lastupdate_enc</lastupdate>
  252.     <size>$size_enc</size>
  253.     <version>$version_enc</version>
  254. EOF
  255. }
  256.  
  257. sub printdoc{
  258.     $$doc_title_var =~ s/</</g;
  259.     $$doc_title_var =~ s/&/&/g;
  260.     $$doc_title_var =~ s/>/>/g;
  261.     $$doc_title_var =~ s/"/"/g;
  262.     $$doc_title_var =~ s/'/'/g;
  263.  
  264.     $$doc_var =~ s/</</g;
  265.     $$doc_var =~ s/&/&/g;
  266.     $$doc_var =~ s/>/>/g;
  267.     $$doc_var =~ s/"/"/g;
  268.     $$doc_var =~ s/'/'/g;
  269.  
  270.     $$doc_url =~ s/</</g;
  271.     $$doc_url =~ s/&/&/g;
  272.     $$doc_url =~ s/>/>/g;
  273.     $$doc_url =~ s/"/"/g;
  274.     $$doc_url =~ s/'/'/g;
  275.  
  276. print UBCDINDEXXML <<EOF;
  277.     <doc>
  278.         <title>$$doc_title_var</title>
  279.         <location>$$doc_var</location>
  280.         <url>$url_orig</url>
  281.     </doc>
  282. EOF
  283. }
  284.  
  285. sub printdtd{
  286. # ubcd.dtd
  287. print UBCDINDEXDTD <<EOF;
  288. <!ELEMENT catalog (utility_info*)>
  289. <!ELEMENT utility_info (utility, description, imagename, dosapp?, category, menu, maintainer, lastupdate, size?, version, doc*)>
  290. <!ELEMENT utility (#PCDATA)>
  291. <!ELEMENT description (#PCDATA)>
  292. <!ELEMENT imagename (#PCDATA)>
  293. <!ELEMENT dosapp (#PCDATA)>
  294. <!ELEMENT category (#PCDATA)>
  295. <!ELEMENT menu (#PCDATA)>
  296. <!ELEMENT maintainer (#PCDATA)>
  297. <!ELEMENT lastupdate (#PCDATA)>
  298. <!ELEMENT size (#PCDATA)>
  299. <!ELEMENT version (#PCDATA)>
  300. <!ELEMENT doc (title, location, url)>
  301. <!ELEMENT title (#PCDATA)>
  302. <!ELEMENT location (#PCDATA)>
  303. <!ELEMENT url (#PCDATA)>
  304. EOF
  305. # end ubcd.dtd
  306. }
  307.  
  308. sub printxsl{
  309. # ubcd.xsl
  310. print UBCDINDEXXST <<EOF;
  311. <?xml version="1.0"?>
  312. <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  313. <xsl:output method="html" encoding="UTF-8"/>
  314.  
  315. <xsl:template match="/">
  316. <html><head><title>UBCD CD Based Docs - XML</title></head>
  317. <body>
  318. <p><b>UBCD CD based documentation</b></p>
  319. <ol>
  320. <xsl:apply-templates mode="TOC"/>
  321. </ol>
  322. <xsl:apply-templates mode="body"/>
  323. </body>
  324. </html>
  325. </xsl:template>
  326.  
  327. <xsl:template match="utility_info" mode="TOC">
  328. <li><a href="{concat('#utility', position())}"><xsl:value-of
  329. select="utility/text()"/></a></li>
  330. </xsl:template>
  331.  
  332. <xsl:template match="utility_info" mode="body">
  333. <p><a name="{concat('utility', position())}"><xsl:value-of
  334. select="text()"/></a></p>
  335. <xsl:apply-templates select="utility"/>
  336. <xsl:text> </xsl:text>
  337. <xsl:apply-templates select="version"/>
  338. <br></br>
  339. <xsl:apply-templates select="description"/>
  340. <br></br>
  341. <xsl:apply-templates select="lastupdate"/>
  342. <xsl:apply-templates select="doc"/>
  343. </xsl:template>
  344.  
  345. <xsl:template match="utility"><b><xsl:value-of
  346. select="text()"/></b></xsl:template>
  347.  
  348. <xsl:template match="description"><xsl:value-of select="text()"/></xsl:template>
  349.  
  350. <!-- this is a way to handle docs. please uncomment this and comment
  351. the doc template under it
  352. <xsl:template match="doc">
  353. <br></br><a href="{url}"><xsl:value-of select="title"/></a>
  354. </xsl:template>
  355. -->
  356.  
  357. <xsl:template match="doc">
  358. <br></br>
  359. <xsl:text>[documentation] </xsl:text>
  360. <xsl:value-of select="title"/>
  361. <xsl:text>: </xsl:text>
  362. <a href="{url}">web</a>
  363. <xsl:text> </xsl:text>
  364. <a href="{location}">local</a>
  365. </xsl:template>
  366.  
  367.  
  368. <xsl:template match="version">Version: <xsl:value-of
  369. select="text()"/></xsl:template>
  370. <xsl:template match="lastupdate">Last Updated: <xsl:value-of
  371. select="text()"/></xsl:template>
  372.  
  373. </xsl:stylesheet>
  374. EOF
  375. # end ubcd.xsl
  376. }
  377.